# read in relevant libraries
library(data.table)
data.table 1.10.2
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
Warning message:
R graphics engine version 12 is not supported by this version of RStudio. The Plots tab will be disabled until a newer version of RStudio is installed.
library(igraph)
Attaching package: ‘igraph’
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:base’:
union
library(recommenderlab)
Loading required package: Matrix
Loading required package: arules
Attaching package: ‘arules’
The following objects are masked from ‘package:base’:
abbreviate, write
Loading required package: proxy
Attaching package: ‘proxy’
The following object is masked from ‘package:Matrix’:
as.matrix
The following objects are masked from ‘package:stats’:
as.dist, dist
The following object is masked from ‘package:base’:
as.matrix
Loading required package: registry
Attaching package: ‘recommenderlab’
The following objects are masked from ‘package:igraph’:
normalize, similarity
library(ggplot2)
# fix the RNG seed so the random draws made further down (e.g. the sample()
# calls) give the same result on every run
set.seed(seed = 23495)
The primary rating data was prepped in a separate code file, sampled, and stored. The cleaned data is imported directly here for convenience.
# Load the pre-sampled ratings table
# (data for 2004, min. 100 user reviews and 100 movie ratings)
rating.dt <- fread(input = "netflix_sampled_data.csv", header = TRUE)

# Load the MovieID -> Title lookup. The file is read without a header row,
# so the column names are supplied here.
## movie title names were adjusted slightly in the csv file prior to import
movies.dt <- fread(
  input = "movie_titles_aws.csv",
  header = FALSE,
  col.names = c("MovieID", "Title")
)
# count the distinct movie ids present in the ratings table
cat("Number of unique movies:", uniqueN(rating.dt$MovieID))
Number of unique movies: 6177
# count the distinct customers that contributed at least one rating
cat("\nNumber of users who provided ratings:", uniqueN(rating.dt$CustomerID))
Number of users who provided ratings: 51374
# total number of rating rows (one row per rating)
cat("\nNumber of total ratings:", rating.dt[, .N])
Number of total ratings: 8467727
# mean of all individual ratings (the median is reported right after)
cat("Average rating across data:", rating.dt[, mean(Rating)])
Average rating across data: 3.409843
# median of all individual ratings
cat("\nMedian rating across data:", rating.dt[, median(Rating)])
Median rating across data: 3
# Histogram of each user's mean rating (indicating lack of uniformity
# in how users rate)
# color options: #AA2B2B #9D2E2E #98141D --> pptx dark red
avg.ratings <- rating.dt[, .(AvgRating = mean(Rating)), by = CustomerID]
ggplot(data = avg.ratings, mapping = aes(x = AvgRating)) +
  geom_histogram(binwidth = 0.2, colour = "gray", fill = "#9D2E2E") +
  labs(
    x = "Average User Rating",
    y = "Number of Users",
    title = "Distribution of Average User Rating"
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Roboto"),
    plot.title = element_text(size = 14, hjust = 0.5, margin = margin(5, 0, 8, 0)),
    axis.title.y = element_text(margin = margin(0, 12, 0, 0)),
    axis.title.x = element_text(margin = margin(10, 0, 0, 0))
  )
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave(filename = "Average rating histogram.png", width = 7, height = 5)
# Distribution of individual ratings: every row of rating.dt is one rating,
# so the bar heights count ratings rather than users.
# color options: #AA2B2B #9D2E2E
ggplot(rating.dt, aes(x = Rating)) +
  geom_histogram(binwidth = 0.3, col = "gray", fill = "#9D2E2E") +
  # y axis relabelled from "Number of Users": the histogram is built on
  # rating rows, not on one row per user
  labs(x = "User Rating of a Movie", y = "Number of Ratings", title = "Distribution of User Ratings") +
  theme_minimal() +
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("Rating Histogram.png", width = 7, height = 5)
# Per-user activity: one row per CustomerID with the number of rating rows
# that user contributed
user.ratings <- rating.dt[, .(NumRated = .N), by = CustomerID]
# mean number of movies rated per user
cat("Average number of movies rated:", user.ratings[, mean(NumRated)])
Average number of movies rated: 164.8251
# median number of movies rated per user
cat("\nMedian number of movies rated:", user.ratings[, median(NumRated)])
Median number of movies rated: 108
# largest number of movies rated by any single user
cat("\nMax number of movies rated:", user.ratings[, max(NumRated)])
Max number of movies rated: 5163
A number of users rated a very large number of movies, e.g. in the 4000s, which would imply 10+ movies seen per day on average. This may be due to multiple individuals sharing an account, or due to the use of on-site surveys to collect ratings of movies a user saw in the past.
# Per-movie summary: how many ratings each movie received and its mean rating
movies.info <- rating.dt[, .(NumberofRatings = .N, AvgRating = mean(Rating)), by = MovieID]
# attach titles; merge() keeps only MovieIDs present in both tables
movies.info <- merge(x = movies.info, y = movies.dt, by = "MovieID")
# Top 10 movies by number of ratings (listed by the next statement)
print(x = "Movies with highest number of ratings")
[1] "Movies with highest number of ratings"
# titles of the 10 most-rated movies, most rated first
head(movies.info[order(-NumberofRatings), Title], n = 10)
[1] "My Big Fat Greek Wedding" "Catch Me If You Can"
[3] "Two Weeks Notice" "Sweet Home Alabama"
[5] "Minority Report" "Road to Perdition"
[7] "Signs" "Harry Potter and the Chamber of Secrets"
[9] "The Bourne Identity" "Lord of the Rings: The Two Towers"
# Top 10 movies by average rating (listed by the next statement)
print(x = "Movies with highest average ratings")
[1] "Movies with highest average ratings"
# titles of the 10 movies with the highest mean rating, best first
head(movies.info[order(-AvgRating), Title], n = 10)
[1] "Lord of the Rings: The Return of the King" "City of God"
[3] "Alias: Season 2" "Raiders of the Lost Ark"
[5] "CSI: Season 2" "24: Season 2"
[7] "CSI: Season 1" "Family Guy: Vol. 2: Season 3"
[9] "The Sopranos: Season 2" "Alias: Season 1"
# Full plot: LOESS trend of a movie's average rating against the number of
# ratings it received (labelled as its degree in the bipartite network)
ggplot(movies.info, aes(x = NumberofRatings, y = AvgRating)) +
  # se = FALSE spelled out: T/F are ordinary variables that can be reassigned
  geom_smooth(method = "loess", se = FALSE, col = "#9D2E2E", size = 1.1) +
  labs(title = "Average Rating versus Movie Degree Centrality", x = "Movie Degree Centrality (in bipartite network)", y = "Average Rating") +
  theme_minimal() + scale_x_continuous(breaks = seq(0, 60000, 10000)) +
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("Average rating vs degree.png", width = 7, height = 5)
# Zoomed-in plot: same trend as the full plot, restricted to movies with
# fewer than 3000 ratings
ggplot(movies.info[NumberofRatings < 3000, ], aes(x = NumberofRatings, y = AvgRating)) +
  # se = FALSE spelled out: T/F are ordinary variables that can be reassigned
  geom_smooth(method = "loess", se = FALSE, col = "#9D2E2E", size = 1.1) +
  labs(title = "Zoomed in: Average Rating versus Movie Degree Centrality", x = "Movie Degree Centrality (in bipartite network)", y = "Average Rating") +
  theme_minimal() + scale_x_continuous(breaks = seq(0, 3000, 500)) +
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("Average rating vs degree_ZOOMED.png", width = 7, height = 5)
# Linear regression of average rating on degree, restricted to movies with
# at most `degree.cutoff` ratings.
# NOTE(review): the previous comments said "1500 or fewer" while the filter
# used 1000. The code value is kept here -- confirm which was intended.
degree.cutoff <- 1000
# subsetting with i returns a new table, so the rename below does not
# touch movies.info
regress.dt <- movies.info[NumberofRatings <= degree.cutoff, ]
setnames(regress.dt, "NumberofRatings", "Degree") # change name to degree
summary(lm(AvgRating ~ Degree, data = regress.dt)) # regression on subset
Call:
lm(formula = AvgRating ~ Degree, data = regress.dt)
Residuals:
Min 1Q Median 3Q Max
-1.68326 -0.30549 0.00245 0.31939 1.61046
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.112e+00 1.200e-02 259.40 < 2e-16 ***
Degree 1.785e-04 2.898e-05 6.16 7.91e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4661 on 4409 degrees of freedom
Multiple R-squared: 0.008534, Adjusted R-squared: 0.008309
F-statistic: 37.95 on 1 and 4409 DF, p-value: 7.908e-10
# same regression on every movie, with no cap on the number of ratings
summary(lm(formula = AvgRating ~ NumberofRatings, data = movies.info))
Call:
lm(formula = AvgRating ~ NumberofRatings, data = movies.info)
Residuals:
Min 1Q Median 3Q Max
-1.70834 -0.29869 0.00949 0.32091 1.62039
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.188e+00 6.524e-03 488.66 <2e-16 ***
NumberofRatings 3.462e-05 2.202e-06 15.72 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4546 on 6175 degrees of freedom
Multiple R-squared: 0.0385, Adjusted R-squared: 0.03835
F-statistic: 247.3 on 1 and 6175 DF, p-value: < 2.2e-16
# Histogram of the number of movies rated per user. Counts of 1000 or more
# are collapsed to 1000 so the long tail lands in a single "1000+" bin.
plot.ratings <- user.ratings[, .(NumRated = pmin(NumRated, 1000))]
ggplot(data = plot.ratings, mapping = aes(x = NumRated)) +
  geom_histogram(binwidth = 8, colour = "gray", fill = "#9D2E2E") +
  labs(
    x = "Number of Movies Rated",
    y = "Number of Users",
    title = "Distribution of Number of Movies Rated"
  ) +
  theme_minimal() +
  scale_x_continuous(
    breaks = seq(100, 1000, 100),
    labels = c("100", "200", "300", "400", "500", "600", "700", "800", "900", "1000+")
  ) +
  #scale_y_continuous(breaks=seq(0,60000,10000)) +
  theme(
    text = element_text(family = "Roboto"),
    plot.title = element_text(size = 14, hjust = 0.5, margin = margin(5, 0, 8, 0)),
    axis.title.y = element_text(margin = margin(0, 12, 0, 0)),
    axis.title.x = element_text(margin = margin(10, 0, 0, 0))
  )
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave(filename = "number of movies rated histogram.png", width = 7, height = 5)
# prep for making network
# Prefix customer ids with "u" and store movie ids as character, so both can
# serve as vertex names in one graph while remaining distinguishable.
# NOTE: this modifies rating.dt in place; re-running the line adds another "u".
rating.dt[, CustomerID := paste0("u", CustomerID)]
rating.dt[, MovieID := as.character(MovieID)]
# make bipartite graph: one undirected edge per rating, user -- movie.
# Columns are selected by name rather than by position so the edge
# direction does not depend on the csv column order.
graph.bp <- graph_from_data_frame(rating.dt[, .(CustomerID, MovieID)], directed = FALSE)
# type = TRUE marks movie vertices, FALSE marks user vertices
V(graph.bp)$type <- V(graph.bp)$name %in% rating.dt$MovieID
# edges were created in row order, so ratings line up with edges one-to-one
E(graph.bp)$weight <- rating.dt$Rating
# look at graph
graph.bp
IGRAPH UNWB 57551 8467727 --
+ attr: name (v/c), type (v/l), weight (e/n)
+ edges (vertex names):
[1] u656399 --3 u1436762--3 u1644750--3 u616720 --3 u1614320--3 u115498 --3 u699878 --3 u2519847--3
[9] u948069 --3 u67315 --3 u603277 --3 u1859725--3 u283774 --3 u1813349--3 u6689 --3 u109089 --3
[17] u525003 --3 u2312349--3 u1977959--3 u21983 --3 u2173816--3 u78931 --3 u2145227--3 u958104 --3
[25] u489962 --3 u206809 --3 u1007809--3 u1562675--3 u1477923--3 u44783 --3 u52540 --3 u870391 --3
[33] u2164676--3 u1281996--3 u2646060--3 u709342 --3 u1658752--3 u2266857--3 u1456369--3 u104768 --3
[41] u1355097--3 u1231910--3 u2599552--3 u153249 --3 u2590630--3 u203667 --3 u2338873--3 u719833 --3
[49] u2003554--3 u2213289--3 u2630072--3 u1614895--3 u1221390--3 u2193643--3 u357507 --3 u1599030--3
[57] u2443370--3 u871580 --3 u1733406--3 u309567 --3 u2096587--3 u290951 --3 u1213801--3 u1045221--3
+ ... omitted several edges
# Visualize graph layout on the subgraph induced by 7500 randomly chosen
# vertices (users and movies alike) of the bipartite graph
# (`vids` spelled out instead of relying on partial matching of `v`)
bp.subplot <- induced_subgraph(graph.bp, vids = sample(V(graph.bp)$name, 7500))
# define color and shape mappings: position 1 is used for type FALSE,
# position 2 for type TRUE
col <- c("gray85", "#9D2E2E")
shape <- c("circle", "square")
# logical type -> 1/2 index into the two lookup vectors above
type.idx <- as.numeric(V(bp.subplot)$type) + 1
plot(bp.subplot,
     vertex.color = col[type.idx],
     vertex.shape = shape[type.idx],
     layout = layout_as_bipartite(bp.subplot, hgap = 30),
     vertex.frame.color = "gray60",
     edge.color = "#E5AAAA",
     vertex.label = "", vertex.size = 5)
# make bipartite graph for the movie-movie network: same edges as before but
# with the movie as the first endpoint, and unweighted.
# Columns are selected by name rather than by position so the result does
# not depend on the csv column order.
graph.bp2 <- graph_from_data_frame(rating.dt[, .(MovieID, CustomerID)], directed = FALSE)
# here type = TRUE marks user vertices and FALSE marks movie vertices
V(graph.bp2)$type <- V(graph.bp2)$name %in% rating.dt$CustomerID
# look at graph
graph.bp2
IGRAPH UN-B 57551 8467727 --
+ attr: name (v/c), type (v/l)
+ edges (vertex names):
[1] 3--u656399 3--u1436762 3--u1644750 3--u616720 3--u1614320 3--u115498 3--u699878 3--u2519847
[9] 3--u948069 3--u67315 3--u603277 3--u1859725 3--u283774 3--u1813349 3--u6689 3--u109089
[17] 3--u525003 3--u2312349 3--u1977959 3--u21983 3--u2173816 3--u78931 3--u2145227 3--u958104
[25] 3--u489962 3--u206809 3--u1007809 3--u1562675 3--u1477923 3--u44783 3--u52540 3--u870391
[33] 3--u2164676 3--u1281996 3--u2646060 3--u709342 3--u1658752 3--u2266857 3--u1456369 3--u104768
[41] 3--u1355097 3--u1231910 3--u2599552 3--u153249 3--u2590630 3--u203667 3--u2338873 3--u719833
[49] 3--u2003554 3--u2213289 3--u2630072 3--u1614895 3--u1221390 3--u2193643 3--u357507 3--u1599030
[57] 3--u2443370 3--u871580 3--u1733406 3--u309567 3--u2096587 3--u290951 3--u1213801 3--u1045221
+ ... omitted several edges
# affiliation (incidence) matrix of the bipartite graph
# NOTE(review): this is built dense and converted afterwards; passing
# sparse = TRUE to as_incidence_matrix(), as the recommender section does,
# would presumably avoid the dense intermediate -- confirm before switching.
mov.mtx <- as_incidence_matrix(graph.bp2)
mov.sp.mtx <- as(mov.mtx, "sparseMatrix") # encode as sparse matrix
# co-affiliation matrix M %*% t(M), used as the movie-movie adjacency matrix
mov.coaffil.mtx <- tcrossprod(mov.sp.mtx)
# make movie-movie coaffiliation network.
# diag = FALSE drops the diagonal (no self-loops); `weighted` is spelled out
# instead of relying on partial matching of `weight`.
graph.movies <- graph_from_adjacency_matrix(mov.coaffil.mtx, mode = "undirected",
                                            diag = FALSE, weighted = TRUE)
# calculate co-affiliation centrality measures
# NOTE(review): graph.movies has a weight edge attribute, which igraph's
# closeness() and eigen_centrality() use by default; closeness reads weights
# as distances, so heavily co-rated movies count as far apart -- confirm
# this is the intended interpretation.
degree.score2 <- degree(graph.movies)
closeness.score2 <- closeness(graph.movies)
eigen.score2 <- eigen_centrality(graph.movies)
# average rating of every movie that is a vertex of the network
movies.ratings.2 <- rating.dt[MovieID %in% V(graph.movies)$name,
                              .(AvgRating = mean(Rating)), by = MovieID]
# one row per movie with its three centrality scores (the earlier duplicate
# construction without EigenCentrality was immediately overwritten and is
# dropped)
movies.performance.2 <- data.table("MovieID" = V(graph.movies)$name, "Degree" = degree.score2,
                                   "Closeness" = closeness.score2, "EigenCentrality" = eigen.score2$vector)
movies.performance.2 <- merge(movies.performance.2, movies.ratings.2, by = "MovieID")
# Movie-Movie Degree Centrality: LOESS trend of average rating against
# degree in the co-affiliation network
ggplot(movies.performance.2, aes(x = Degree, y = AvgRating)) +
  # se = FALSE spelled out: T/F are ordinary variables that can be reassigned
  geom_smooth(method = "loess", se = FALSE, col = "#9D2E2E", size = 1.1) +
  labs(title = "Average Rating versus Movie-Movie Degree Centrality", x = "Movie Degree Centrality (in co-affiliation network)", y = "Average Rating") +
  theme_minimal() +
  # font family added so this figure matches the other plots in this script
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("movie-movie degree vs avg rating.png", width = 7, height = 5)
# Movie-Movie Closeness Centrality: LOESS trend of average rating against
# closeness in the co-affiliation network
ggplot(movies.performance.2, aes(x = Closeness, y = AvgRating)) +
  # se = FALSE spelled out: T/F are ordinary variables that can be reassigned
  geom_smooth(method = "loess", se = FALSE, col = "#9D2E2E", size = 1.1) +
  labs(title = "Average Rating versus Movie-Movie Closeness Centrality", x = "Movie Closeness Centrality (in co-affiliation network)", y = "Average Rating") +
  theme_minimal() +
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("movie-movie closeness vs avg rating.png", width = 7, height = 5)
# Movie-Movie Eigen Centrality: LOESS trend of average rating against
# eigenvector centrality in the co-affiliation network
ggplot(movies.performance.2, aes(x = EigenCentrality, y = AvgRating)) +
  # se = FALSE spelled out: T/F are ordinary variables that can be reassigned
  geom_smooth(method = "loess", se = FALSE, col = "#9D2E2E", size = 1.1) +
  labs(title = "Average Rating versus Movie-Movie Eigen Centrality", x = "Movie Eigen Centrality (in co-affiliation network)", y = "Average Rating") +
  theme_minimal() +
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("movie-movie eigen vs avg rating.png", width = 7, height = 5)
# how to interpret: simple linear fits of average rating on each
# movie-movie centrality measure, starting with degree
summary(lm(formula = AvgRating ~ Degree, data = movies.performance.2))
Call:
lm(formula = AvgRating ~ Degree, data = movies.performance.2)
Residuals:
Min 1Q Median 3Q Max
-1.7551 -0.3049 0.0101 0.3276 1.5943
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.0413183 0.3881069 10.413 <2e-16 ***
Degree -0.0001308 0.0000630 -2.076 0.0379 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4634 on 6175 degrees of freedom
Multiple R-squared: 0.0006977, Adjusted R-squared: 0.0005359
F-statistic: 4.311 on 1 and 6175 DF, p-value: 0.0379
# average rating regressed on movie-movie closeness centrality
summary(lm(formula = AvgRating ~ Closeness, data = movies.performance.2))
Call:
lm(formula = AvgRating ~ Closeness, data = movies.performance.2)
Residuals:
Min 1Q Median 3Q Max
-1.69906 -0.29266 0.00845 0.31557 1.65569
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.785e+00 2.851e-02 132.75 <2e-16 ***
Closeness -9.606e+03 4.884e+02 -19.67 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4497 on 6175 degrees of freedom
Multiple R-squared: 0.05896, Adjusted R-squared: 0.05881
F-statistic: 386.9 on 1 and 6175 DF, p-value: < 2.2e-16
# average rating regressed on movie-movie eigenvector centrality
summary(lm(formula = AvgRating ~ EigenCentrality, data = movies.performance.2))
Call:
lm(formula = AvgRating ~ EigenCentrality, data = movies.performance.2)
Residuals:
Min 1Q Median 3Q Max
-1.69644 -0.29628 0.00849 0.31753 1.62679
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.176876 0.006693 474.67 <2e-16 ***
EigenCentrality 0.785185 0.045570 17.23 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4528 on 6175 degrees of freedom
Multiple R-squared: 0.04587, Adjusted R-squared: 0.04572
F-statistic: 296.9 on 1 and 6175 DF, p-value: < 2.2e-16
# Restrict the user-user network to users with at least 200 ratings.
# Work on a copy so the NumRatings helper column is not added to rating.dt.
users.sample <- copy(rating.dt)[, NumRatings := .N, by = CustomerID]
users.sample <- users.sample[NumRatings >= 200]
# create bipartite graph from the reduced ratings table (user -- movie edges).
# Columns are selected by name rather than by position so the result does
# not depend on the csv column order.
graph.bp3 <- graph_from_data_frame(users.sample[, .(CustomerID, MovieID)], directed = FALSE)
# type = TRUE marks movie vertices, FALSE marks user vertices
V(graph.bp3)$type <- V(graph.bp3)$name %in% users.sample$MovieID
# edges were created in row order, so ratings line up with edges one-to-one
E(graph.bp3)$weight <- users.sample$Rating
# get coaffiliation matrix from bipartite graph
# NOTE(review): the incidence matrix is built dense and converted afterwards;
# sparse = TRUE in as_incidence_matrix() would presumably avoid the dense
# intermediate -- confirm before switching.
users.mtx <- as_incidence_matrix(graph.bp3) # affiliation matrix
users.sp.mtx <- as(users.mtx, "sparseMatrix") # encode as sparse matrix
# co-affiliation matrix M %*% t(M), used as the user-user adjacency matrix
users.coaffil.mtx <- tcrossprod(users.sp.mtx)
# make user-user coaffiliation network.
# diag = FALSE drops the diagonal (no self-loops); `weighted` is spelled out
# instead of relying on partial matching of `weight`.
graph.users <- graph_from_adjacency_matrix(users.coaffil.mtx, mode = "undirected",
                                           diag = FALSE, weighted = TRUE)
# calculate co-affiliation centrality measures
# NOTE(review): graph.users has a weight edge attribute, which igraph's
# closeness() and eigen_centrality() use by default; closeness reads weights
# as distances -- confirm this is the intended interpretation.
degree.score3 <- degree(graph.users)
closeness.score3 <- closeness(graph.users)
eigen.score3 <- eigen_centrality(graph.users)
# average rating given by every user that is a vertex of the network
# (the object names say "movies" but the rows are users)
movies.ratings.3 <- rating.dt[CustomerID %in% V(graph.users)$name,
                              .(AvgRating = mean(Rating)), by = CustomerID]
# one row per user with its three centrality scores (the earlier duplicate
# construction without EigenCentrality was immediately overwritten and is
# dropped)
movies.performance.3 <- data.table("CustomerID" = V(graph.users)$name, "Degree" = degree.score3,
                                   "Closeness" = closeness.score3, "EigenCentrality" = eigen.score3$vector)
movies.performance.3 <- merge(movies.performance.3, movies.ratings.3, by = "CustomerID")
# User-User Degree Centrality: LOESS trend of a user's average rating
# against degree in the co-affiliation network
ggplot(movies.performance.3, aes(x = Degree, y = AvgRating)) +
  # se = FALSE spelled out: T/F are ordinary variables that can be reassigned
  geom_smooth(method = "loess", se = FALSE, col = "#9D2E2E", size = 1.1) +
  labs(title = "Average Rating versus User-User Degree Centrality", x = "User Degree Centrality (in co-affiliation network)", y = "Average Rating") +
  theme_minimal() +
  # font family added so this figure matches the other plots in this script
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("user-user degree vs avg rating.png", width = 7, height = 5)
# User-User Closeness Centrality: LOESS trend of a user's average rating
# against closeness in the co-affiliation network
ggplot(movies.performance.3, aes(x = Closeness, y = AvgRating)) +
  # se = FALSE spelled out: T/F are ordinary variables that can be reassigned
  geom_smooth(method = "loess", se = FALSE, col = "#9D2E2E", size = 1.1) +
  labs(title = "Average Rating versus User-User Closeness Centrality", x = "User Closeness Centrality (in co-affiliation network)", y = "Average Rating") +
  theme_minimal() +
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("user-user closeness vs avg rating.png", width = 7, height = 5)
# User-User Eigen Centrality: LOESS trend of a user's average rating against
# eigenvector centrality in the co-affiliation network
ggplot(movies.performance.3, aes(x = EigenCentrality, y = AvgRating)) +
  # se = FALSE spelled out: T/F are ordinary variables that can be reassigned
  geom_smooth(method = "loess", se = FALSE, col = "#9D2E2E", size = 1.1) +
  labs(title = "Average Rating versus User-User Eigen Centrality", x = "User Eigen Centrality (in co-affiliation network)", y = "Average Rating") +
  theme_minimal() +
  theme(text = element_text(family = "Roboto"),
        plot.title = element_text(size = 14, hjust = 0.5, margin = margin(t = 5, r = 0, b = 8, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 12, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)))
# with no plot argument, ggsave() writes the most recently displayed plot
ggsave("user-user eigen vs avg rating.png", width = 7, height = 5)
# how to interpret: simple linear fits of a user's average rating on each
# user-user centrality measure, starting with degree
summary(lm(formula = AvgRating ~ Degree, data = movies.performance.3))
Call:
lm(formula = AvgRating ~ Degree, data = movies.performance.3)
Residuals:
Min 1Q Median 3Q Max
-2.41998 -0.26919 0.00282 0.27910 1.58002
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -248.09644 242.91656 -1.021 0.307
Degree 0.02119 0.02047 1.035 0.301
Residual standard error: 0.4334 on 11867 degrees of freedom
Multiple R-squared: 9.033e-05, Adjusted R-squared: 6.071e-06
F-statistic: 1.072 on 1 and 11867 DF, p-value: 0.3005
# user average rating regressed on user-user closeness centrality
summary(lm(formula = AvgRating ~ Closeness, data = movies.performance.3))
Call:
lm(formula = AvgRating ~ Closeness, data = movies.performance.3)
Residuals:
Min 1Q Median 3Q Max
-2.36042 -0.26918 0.00278 0.28199 1.64601
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.281e+00 1.664e-02 197.203 <2e-16 ***
Closeness 3.144e+04 3.650e+03 8.613 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4321 on 11867 degrees of freedom
Multiple R-squared: 0.006213, Adjusted R-squared: 0.006129
F-statistic: 74.19 on 1 and 11867 DF, p-value: < 2.2e-16
# user average rating regressed on user-user eigenvector centrality
summary(lm(formula = AvgRating ~ EigenCentrality, data = movies.performance.3))
Call:
lm(formula = AvgRating ~ EigenCentrality, data = movies.performance.3)
Residuals:
Min 1Q Median 3Q Max
-2.41401 -0.26952 0.00205 0.27861 1.57799
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.428648 0.009872 347.320 <2e-16 ***
EigenCentrality -0.035101 0.035585 -0.986 0.324
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4334 on 11867 degrees of freedom
Multiple R-squared: 8.198e-05, Adjusted R-squared: -2.278e-06
F-statistic: 0.973 on 1 and 11867 DF, p-value: 0.324
# Build the sparse incidence matrix of the (undirected) bipartite rating
# graph, with the edge weight -- i.e. the rating -- as the cell value
input.mtx <- as_incidence_matrix(graph = graph.bp, attr = "weight", sparse = TRUE)
# convert to recommenderlab's rating-matrix class
input.mtx <- as(object = input.mtx, Class = "realRatingMatrix")
# inspect the rating distributions: raw ratings, normalized with the default
# method, and Z-score normalized
hist(x = getRatings(input.mtx))
hist(x = getRatings(normalize(input.mtx)), breaks = 100)
hist(x = getRatings(normalize(input.mtx, method = "Z-score")), breaks = 100)
# distribution of the per-column mean rating
hist(x = colMeans(input.mtx), breaks = 20)
# create a user-based collaborative filtering (UBCF) recommender on the
# full rating matrix
rec.model <- Recommender(input.mtx, method = "UBCF")
# get a random user name; the draw is over rating rows, so users with more
# ratings are more likely to be picked
test.user <- sample(rating.dt$CustomerID, 1)
# number of ratings that user has in the matrix (row selected with an
# explicit trailing comma, matching the predict() call further down)
rowCounts(input.mtx[test.user, ])
u1887657
1062
# ratings above 3 given by the test user
rated.high <- rating.dt[CustomerID == test.user & Rating > 3, ]
# top-10 recommendations for the test user from the fitted model
rec.test.user <- predict(rec.model, input.mtx[test.user, ], n = 10)
# flatten the recommended item labels and convert them to numeric movie ids
# so they can be matched against movies.dt$MovieID
rec.compare <- as.numeric(unlist(as(rec.test.user, "list")))
# titles of the movies the user rated highly
high <- movies.dt[MovieID %in% rated.high$MovieID, .(MovieID, Title)]
# titles of the recommended movies
recommend <- movies.dt[MovieID %in% rec.compare, .(MovieID, Title)]
# print both tables for a side-by-side sanity check
high
recommend
# evaluate different methods: single 75/25 train/test split, 20 ratings
# given per test user, ratings >= 4 counted as "good"
# NOTE: the name `eval` also belongs to a base R function; it is kept
# because the evaluate() calls further down refer to this object by name.
eval <- evaluationScheme(input.mtx, method = "split", train = 0.75, given = 20, goodRating = 4)
eval
Evaluation scheme with 20 items given
Method: ‘split’ with 1 run(s).
Training set proportion: 0.750
Good ratings: >=4.000000
Data set: 51374 x 6177 rating matrix of class ‘realRatingMatrix’ with 8467727 ratings.
# Recommender algorithms to compare (they perform normalization
# automatically). Each entry gives the method name and its parameters.
algorithms <- list(
  "random items"      = list(name = "RANDOM",  param = NULL),
  "popular items"     = list(name = "POPULAR", param = NULL),
  "user-based CF"     = list(name = "UBCF",    param = list(nn = 50)),
  "item-based CF"     = list(name = "IBCF",    param = list(k = 50)),
  "SVD approximation" = list(name = "SVD",     param = list(k = 50))
)
# evaluate top-N recommendation lists for N = 1, 5, 10, 20 and 50
results1 <- evaluate(
  eval,
  algorithms,
  type = "topNList",
  n = c(1, 5, 10, 20, 50)
)
RANDOM run fold/sample [model time/prediction time]
1 [0.069sec/124.403sec]
POPULAR run fold/sample [model time/prediction time]
1 [1.211sec/4112.694sec]
UBCF run fold/sample [model time/prediction time]
1 [1.228sec/36012.91sec]
IBCF run fold/sample [model time/prediction time]
1 [12666.36sec/16.449sec]
SVD run fold/sample [model time/prediction time]
1 [74.248sec/127.818sec]
# ROC curves for the top-N evaluation, one per algorithm
plot(
  results1,
  annotate = c(1, 3),
  legend = "bottomright",
  main = "Comparison of ROC curves for 5 recommender methods"
)
# evaluate how well each algorithm predicts the ratings themselves
results2 <- evaluate(
  eval,
  algorithms,
  type = "ratings"
)
RANDOM run fold/sample [model time/prediction time]
1 [0.067sec/54.937sec]
POPULAR run fold/sample [model time/prediction time]
1 [1.013sec/31.599sec]
UBCF run fold/sample [model time/prediction time]
1 [0.914sec/34170.9sec]
IBCF run fold/sample [model time/prediction time]
1 [12895.81sec/12.755sec]
SVD run fold/sample [model time/prediction time]
1 [73.395sec/56.64sec]
# error-metric comparison plot for the rating predictions
plot(
  results2,
  ylim = c(0, 3),
  main = "Comparison of RMSE, MSE, and MAE for 5 recommender methods"
)